Load required library
library(scater)
library(Seurat)
library(patchwork)
library(ggplot2)
library(dplyr)
library(reshape2)
library(ggbeeswarm)
library(gghighlight)
library(ggrepel)
library(viridis)
library(ggforce)
library(paletteer)
# Scater
library(scater)
library(scran)
library(dendextend)
library(dynamicTreeCut)
Specify the work space
workDir <- getwd()
workDir <- gsub("/scripts", "", workDir)
dataDir <- file.path(workDir, "data")
outDir <- file.path(workDir, "output")
# out data
outData_dir <- file.path(outDir, "out_data")
outDir_cur <- file.path(outDir, "Chapter_03_InHouse_YS")
dir.create(outDir_cur)
## Warning in dir.create(outDir_cur): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS' already exists
set.seed(123)
source(file.path(workDir, "functions.R"))
Load the integrated object
load(file=file.path(outData_dir, "ForIntegration_InHouse_Public.Rdata"))
We have read the original single cell exp object
se_next <- readRDS(file.path(outData_dir, "Se_Next_allGenes_ENSID.RDS"))
se_nova <- readRDS(file.path(outData_dir, "Se_Nova_allGenes_ENSID.RDS"))
Load INDEX sort data
# We know all cell sorted in this plate is Runx1 / Gfi1 reporter
df_inx_WN_S6 <- read.csv(file.path(dataDir, "INDEX_data/Metadata_WNs6_AGM060.csv")) %>%
dplyr::rename(FACS_well = OriWell) %>%
mutate(Plate = gsub("Plate", "", Plate),
SC_project = 'WN-S6',
UniqID = paste(SC_project, Plate, FACS_well, sep="_")) %>%
dplyr::select(UniqID, Population2, Runx1b_RFP, Gfi1_GFP) %>%
mutate(Allele = "RG")
# plate WN S7
df_inx_WN_S7 <- read.csv(file.path(dataDir, "INDEX_data/Metadata_WNs7_WNs8_AGM068.csv")) %>%
dplyr::rename(FACS_well = OriWell) %>%
mutate(tmp = Plate,
SC_project = gsub("_p[1-5]", "", tmp),
SC_project = gsub("_", "-", SC_project),
Plate = gsub(".*_p", "", tmp),
UniqID = paste(SC_project, Plate, FACS_well, sep="_")) %>%
dplyr::select(UniqID, Population2, Runx1b_RFP, Gfi1_GFP,FACS_well)
# We know which well is only the Gfi1 reporter (based on plate layout excel file)
Gfi_mouse_well <- c(
paste0(rep("G", each = ),sprintf("%02d",5:12)),
paste0(rep(LETTERS[8:15], each = 23),sprintf("%02d",1:12)),
paste0(rep("P", each=3),sprintf("%02d",10:13)))
df_inx_WN_S7 <- df_inx_WN_S7 %>%
mutate(Allele = if_else(FACS_well %in% Gfi_mouse_well, "G", "RG")) %>%
dplyr::select(-FACS_well)
df_indx <- rbind(df_inx_WN_S6, df_inx_WN_S7)
# Lets also read INDEX data for EMP and LMP
df_inx_WN_S10 <- read.csv(file.path(dataDir, "INDEX_data/Metadata_WNs10_AGM083.csv")) %>%
dplyr::rename(FACS_well = OriWell) %>%
mutate(Plate = "1",
SC_project = 'WN-S10',
UniqID = paste(SC_project, Plate, FACS_well, sep="_")) %>%
dplyr::select(UniqID, Population, CD127_eF450:LIN, CD127_PE:CD41_PECy7) %>%
mutate(Allele = "WT")
Load list of CC genes
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes
Read list of cell surface genes
df_cs <- read.delim(file.path(dataDir, "Others/RM_cur_CS_mouse.txt"), stringsAsFactors = FALSE)
outDir_inHouse6_scater <- file.path(outDir_cur, "Scater_analysis_YS_only")
dir.create(outDir_inHouse6_scater)
## Warning in dir.create(outDir_inHouse6_scater): '/Users/zfadlullah/Dropbox (The
## University of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/
## output/Chapter_03_InHouse_YS/Scater_analysis_YS_only' already exists
We use the Scater object where Gfi1/1b KO and TechRep were removed and re-run the analysis in scater Simply merge the two objects
We loaded the single cell object in the Set-up section. Simply merge the two objects
dat1 <- assay(se_next, "counts")
dat2 <- assay(se_nova, "counts")
dat <- cbind(dat1,dat2)
c.anno1 <- colData(se_next)
c.anno2 <- colData(se_nova)
c.anno <- rbind(c.anno1, c.anno2)
g.anno <- rowData(se_next)
colnames(g.anno)[5] <- "chr_name"
se <-
SingleCellExperiment(
assays = list(counts = dat),
colData = c.anno,
rowData = g.anno)
se$condition <- factor(se$condition, levels=com_popFac)
Retain only the YS cells and remove Gfi1/1b KOs
AnoSite <- se$condition
AnoSite <- gsub("_.*", "", AnoSite)
table(AnoSite)
## AnoSite
## AGM MES YS
## 702 94 1504
se$Site <- AnoSite
se <- se[, se$Site == "YS"]
# Remove Gfi1/1b KO
se <- se[,!se$condition == "YS_E9.5_HE_Gfi1s_KO"]
# Remove the KIT negative sorted cells
se <- se[,!se$condition %in% c("YS_E9.0_HE_Gfi1_Kneg", "YS_E9.5_HE_Gfi1_Kneg", "YS_E10_HE_Gfi1_Kneg")]
Remove tech duplciate
Now remove technical replicate, we select the ones with the most genes to keep.
cD <- as.data.frame(colData(se))
cD <- cD %>%
# Find the lane in which the sample was sequenced on
mutate(Lane = gsub("*.L001", "L1", FullName),
Lane = gsub(".*L002", "L2", Lane),
Lane = gsub("GL.*", "L1", Lane),
SeqBatch = paste(SeqRun, Lane, sep="_"),
# Find the Uniq ID
Plate_Well = paste(SC_project, SC_plate, FACS_well, sep="_"))
# Identify tech replicates
df_com_twice <- as_tibble(cD) %>%
dplyr::count(Plate_Well) %>% dplyr::filter(n > 1)
qc_cell <- perCellQCMetrics(se)
cD <- cbind(cD, qc_cell)
cD_noTech <- cD %>%
arrange(desc(detected)) %>%
distinct(Plate_Well,.keep_all = TRUE)
# Subset
se <- se[,se$Barcode %in% cD_noTech$Barcode]
Remove all ENS with 0 counts
row_sub <- rowSums(dat)
row_sub <- names(row_sub)[row_sub > 0]
Convert to gene names
se <- se[row_sub, ]
new.row.names <- uniquifyFeatureNames(
rowData(se)$ID,
rowData(se)$Symbol
)
rownames(se) <- new.row.names
head(rownames(se), 5)
## [1] "Xkr4" "Gm1992" "Gm19938" "Gm37381" "Rp1"
set.seed(123)
clust.se <- quickCluster(se)
se <- computeSumFactors(se, cluster=clust.se, min.mean=0.1)
se <- logNormCounts(se)
#assay(se,"logcounts")["Runx1",]
set.seed(123)
dec.se <- modelGeneVar(se)
# Visualizing the fit:
fit.se <- metadata(dec.se)
plot(fit.se$mean, fit.se$var, xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(fit.se$trend(x), col="dodgerblue", add=TRUE, lwd=2)
mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds",
package="scran"))
# Using Ensembl IDs to match up with the annotation in 'mm.pairs'.
assignments <- cyclone(se, mm.pairs, gene.names=rowData(se)$ID)
se$Phase <- assignments$phases
saveRDS(se, file.path(outDir_cur, "seNorm_YS.RDS"))
Select the most variable genes
set.seed(123)
top.prop <- getTopHVGs(dec.se, prop=0.1)
top.se <- getTopHVGs(dec.se, n=2000)
length(top.prop)
## [1] 967
length(top.se)
## [1] 2000
chosen <- top.se
Run dim reduction
se <- runPCA(se, subset_row=chosen)
percent.var <- attr(reducedDim(se), "percentVar")
plot(percent.var, log="y", xlab="PC", ylab="Variance explained (%)")
UMAP
set.seed(123)
se <- runUMAP(se, dimred="PCA", subset_row=chosen,
pca=30,
n_neighbors=30,
min_dist=0.4)
plotUMAP(se, colour_by="condition")
Graph based clustering
# Graph based
g <- buildSNNGraph(se, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
se$GraphBased_all <- paste0("g_", sprintf("%02d",clust))
Visualise UMAP better
umap.df.int <- reducedDim(se, "UMAP") %>%
cbind(colData(se)) %>%
as.data.frame() %>%
dplyr::rename(UMAP_1 = V1,
UMAP_2 = V2)
Plot
p1 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=SeqRun)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~SeqRun) +
theme_custom +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the Sequencing batch")
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~condition) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the FACS population")
p3 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased_all)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~GraphBased_all) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
p4 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=Phase)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~Phase) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cell cycle")
png(file.path(outDir_inHouse6_scater, "facet_SeqBatch.png"), height=350, width = 850)
p1
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater, "facet_FACS.png"), height=950, width = 1200)
p2
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater, "facet_cluster.png"), height=950, width = 1200)
p3
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater, "facet_CC.png"), height=350, width = 850)
p4
dev.off()
## quartz_off_screen
## 2
We can check the expression of the following genes
plotUMAP(se, colour_by=c("Folr1"))
plotUMAP(se, colour_by=c("Dlk1"))
plotUMAP(se, colour_by=c("Ptn"))
plotUMAP(se, colour_by=c("Hbb-y"))
Save the single cell object
seOri <- se
saveRDS(seOri, file.path(outDir_inHouse6_scater, "SeNotFiltered.RDS"))
#seOri <- readRDS(file.path(outDir_inHouse6_scater, "SeNotFiltered.RDS"))
outDir_inHouse6_scater_outlier <- file.path(outDir_inHouse6_scater, "Outlier_removed")
dir.create(outDir_inHouse6_scater_outlier)
## Warning in dir.create(outDir_inHouse6_scater_outlier): '/Users/zfadlullah/
## Dropbox (The University of Manchester)/zaki/PhD/sequencing_runs/
## 2021/2021_06_01_MZ_016/output/Chapter_03_InHouse_YS/Scater_analysis_YS_only/
## Outlier_removed' already exists
Things we remove ;
Folr1 - https://pubmed.ncbi.nlm.nih.gov/19180647/
By selecting regions on the UMAP we can remove the cells above
# Remove the Hbb clusters
toRemove_1 <- umap.df.int %>%
dplyr::filter(GraphBased_all == "g_12") %>%
pull(Barcode)
Recluster
se <- se[,!se$GraphBased_all == "g_12"]
g <- buildSNNGraph(se, k=5, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
length(unique(clust))
## [1] 13
se$GraphBased_all <- paste0("g_", sprintf("%02d",clust))
Re-plot
umap.df.int2 <- reducedDim(se, "UMAP") %>%
cbind(colData(se)) %>%
as.data.frame() %>%
dplyr::rename(UMAP_1 = V1,
UMAP_2 = V2)
p5 <-
ggplot(
umap.df.int2,
aes(x=UMAP_1, y=UMAP_2, colour=GraphBased_all)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~GraphBased_all) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
png(file.path(outDir_inHouse6_scater, "facet_cluster_noHbb.png"), height=950, width = 1200)
p5
dev.off()
## quartz_off_screen
## 2
Remove additional cells
plotUMAP(se, colour_by=c("Folr1"))
plotUMAP(se, colour_by=c("Dlk1"))
plotUMAP(se, colour_by=c("Ptn"))
# Remove MES (ptn expressing cells)
toRemove_2 <- umap.df.int2 %>%
dplyr::filter(GraphBased_all == "g_13") %>%
pull(Barcode)
# Remove Folr1 expressing cells
toRemove_3 <- umap.df.int2 %>%
dplyr::filter(GraphBased_all == "g_11") %>%
pull(Barcode)
Find cells in the cluster with high Ribo and low numebr of genes detected
#se <- seOri
rD <- as.data.frame(rowData(se))
# Specific the location
MT_gene <- rD$chr_name == "chrM"
Hbbs <- rD[grepl("Hb[a-b]", rD$Symbol),] %>% pull(Symbol)
Hbbs <- c(Hbbs, "Gypa")
Ribos_1 <- rD[grepl("Rps", rD$Symbol),] %>% pull(Symbol)
Ribos_2 <- rD[grepl("Rpl", rD$Symbol),] %>% pull(Symbol)
Ribos <- c(Ribos_1, Ribos_2)
# Get the Ens ID
Hb_ens <- rD %>%
dplyr::filter(Symbol %in% Hbbs)
Hbb_gene <- rowData(se)$ID %in% Hb_ens$ID
Ribo_ens <- rD %>%
dplyr::filter(Symbol %in% Ribos)
Ribo_gene <- rowData(se)$ID %in% Ribo_ens$ID
#We examine the distibution of the data in all the run
df_cData <-
perCellQCMetrics(se,
subsets=list(Mito=MT_gene,
Hbb=Hbb_gene,
Ribo=Ribo_gene)) %>%
cbind(as.data.frame(colData(se))) %>%
as.data.frame() %>%
dplyr::mutate(Cell = colnames(se))
se$detected <- df_cData$detected
se$RiboPerc <- df_cData$subsets_Ribo_percent
se$MitoPerc <- df_cData$subsets_Mito_percent
plotColData(se, x="GraphBased_all", y="MitoPerc")
plotColData(se, x="GraphBased_all", y="RiboPerc")
plotColData(se, x="GraphBased_all", y="detected")
# Remove the cluster with highest Ribo which also has lowest number of detected genes
toRemove_4 <- umap.df.int2 %>%
filter(GraphBased_all == "g_12") %>%
pull(Barcode)
Theres another cluster (g_10) that has low genes and high Riba as well.
# ----- Scran -------- #
# Quick check what is expressed in g_10
colLabels(se) <- se$GraphBased_all
# We can change the pval.type="all" to be more stringent
markers.se <- findMarkers(se, pval.type="some", direction="up")
chosen_clus <- "g_10"
interesting <- markers.se[[chosen_clus]]
#best.set <- interesting[interesting$Top <= 10,]
plotUMAP(se, colour_by=c("Nutf2-ps1"))
# ------ Seurat -------- #
seuset <- Seurat::as.Seurat(se)
seuset <- NormalizeData(seuset)
Idents(seuset) <- seuset$GraphBased_all
su.markers <-
Seurat::FindMarkers(seuset, only.pos = FALSE,
ident.1 = c("g_10"),
#ident.2 = c("C05"),
min.pct = 0.25, logfc.threshold = 0.25)
top_x <- su.markers %>%
tibble::rownames_to_column(var = "gene") %>%
dplyr::filter(p_val_adj < 0.05) %>%
top_n(n = 10, wt = avg_log2FC)
FeaturePlot(seuset, top_x$gene)
FeaturePlot(seuset, row.names(interesting)[1:10])
We remove this cluster as well
toRemove_5 <- umap.df.int2 %>%
filter(GraphBased_all == "g_10") %>%
pull(Barcode)
Remove the selected cells
toRemove <- unique(c(toRemove_1, toRemove_2, toRemove_3, toRemove_4, toRemove_5))
Plot the selected cells
umap.df.int3 <- umap.df.int %>%
dplyr::mutate(isSel = if_else(Barcode %in% toRemove, "Discard", "Retain"))
p1 <-
ggplot(umap.df.int3, aes(x=UMAP_1, y=UMAP_2, colour=isSel)) +
geom_point() +
#gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
#facet_wrap(~cluster) +
theme_custom
png(file.path(outDir_inHouse6_scater, "SelectedCells.png"), height=300, width = 450)
p1
dev.off()
## quartz_off_screen
## 2
Subset to the selected cells, and re-do the HVG selection, dim reduction etc
#seAll <- se
#se <- seAll
SelctedCells <- umap.df.int3 %>%
dplyr::filter(isSel == "Retain")
se <- se[,se$Barcode %in% SelctedCells$Barcode]
set.seed(123)
dec.se <- modelGeneVar(se)
top.se <- getTopHVGs(dec.se, n=2500)
chosen <- top.se
se <- runPCA(se, subset_row=chosen)
set.seed(123)
se <- runUMAP(se, dimred="PCA", subset_row=chosen,
pca=30,
n_neighbors=30,
min_dist=0.3)
plotUMAP(se, colour_by="condition")
UMAP when CC is removed
diff <- getVarianceExplained(se, "Phase")
discard <- diff > 5
top.hvgs2 <- getTopHVGs(se[which(!discard),], n=2500)
se.nocycle <- runPCA(se, subset_row=top.hvgs2)
set.seed(9999)
se.nocycle <- runUMAP(se.nocycle, dimred="PCA", subset_row=top.hvgs2,
pca=30,
n_neighbors=30,
min_dist=0.4)
plotUMAP(se.nocycle, colour_by="condition")
chosen2 <- top.hvgs2
Re-do clustering
# Hirarchical clustering
my.dist <- dist(reducedDim(se, "PCA")[,1:0])
my.tree <- hclust(my.dist, "ward.D2")
my.dend <- as.dendrogram(my.tree, hang=0.1)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0, minClusterSize=10, deepSplit=2))
#my.clusters <- paste0("k_", my.clusters)
length(unique(my.clusters))
# Making a prettier dendrogram.
my.tree$labels <- seq_along(my.tree$labels)
#dend <- as.dendrogram(my.tree, hang=0.1)
numClus <- length(unique(my.clusters))
coltree <- gg_color_hue(n=numClus)
my.tree_or <- my.clusters[order.dendrogram(my.dend)]
# Setting up colours for the FACS population
cFACS <- coltree[as.factor(se$TreeCut)]
cFACS <- coltree[as.factor(se$TreeCut)[order.dendrogram(dend)]]
no0_unique <-
function(x) {
u_x <- unique(x)
u_x[u_x != 0]
}
clusters_numbers <- no0_unique(my.tree_or)
n_clusters <- length(clusters_numbers)
cols <- gg_color_hue(n=n_clusters)
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or, values = cols) %>%
set("labels_col", "black")
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)
UniqClus <- unique(my.tree_or)
pdf(file.path(outDir_inHouse6_scater_outlier, "Dend_PCA.pdf"),height = 4, width = 12)
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)
plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)
dev.off()
se$TreeCut <- paste0("k_", sprintf("%02d",my.clusters))
# Hierarch by gene exp
chosen.exprs <- assay(se, "logcounts")[chosen,]
my.dist = dist(as.matrix(t(chosen.exprs)), method = "euclidean")
my.tree <- hclust(my.dist, method="ward.D2")
my.dend <- as.dendrogram(my.tree, hang=0.1)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0, minClusterSize=10, deepSplit=2))
#my.clusters <- paste0("k_", my.clusters)
length(unique(my.clusters))
## [1] 10
# Making a prettier dendrogram.
my.tree$labels <- seq_along(my.tree$labels)
#dend <- as.dendrogram(my.tree, hang=0.1)
numClus <- length(unique(my.clusters))
coltree <- gg_color_hue(n=numClus)
my.tree_or <- my.clusters[order.dendrogram(my.dend)]
# Setting up colours for the FACS population
cFACS <- coltree[as.factor(se$TreeCut)]
cFACS <- coltree[as.factor(se$TreeCut)[order.dendrogram(my.dend)]]
no0_unique <-
function(x) {
u_x <- unique(x)
u_x[u_x != 0]
}
clusters_numbers <- no0_unique(my.tree_or)
n_clusters <- length(clusters_numbers)
cols <- gg_color_hue(n=n_clusters)
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or, values = cols) %>%
set("labels_col", "black")
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)
UniqClus <- unique(my.tree_or)
pdf(file.path(outDir_inHouse6_scater_outlier, "Dend.pdf"),height = 4, width = 12)
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)
plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)
dev.off()
## quartz_off_screen
## 2
se$TreeCut <- paste0("k_", sprintf("%02d",my.clusters))
Graph based
# Re-run clusters
g <- buildSNNGraph(se, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
se$GraphBased <- paste0("g_", sprintf("%02d",clust))
Visualise UMAP better
umap.df.int <- reducedDim(se, "UMAP") %>%
cbind(colData(se)) %>%
as.data.frame() %>%
dplyr::rename(UMAP_1 = V1,
UMAP_2 = V2)
Plot
p1 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=SeqRun)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~SeqRun) +
theme_custom +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the Sequencing batch")
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~condition) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the FACS population")
p3 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~TreeCut) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
p3.5 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~GraphBased) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
p4 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=Phase)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~Phase) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cell cycle")
png(file.path(outDir_inHouse6_scater_outlier, "facet_SeqBatch.png"), height=350, width = 850)
p1
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_FACS.png"), height=950, width = 1200)
p2
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_cluster_tree.png"), height=950, width = 1200)
p3
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_cluster_graph_default.png"), height=950, width = 1200)
p3.5
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_inHouse6_scater_outlier, "facet_CC.png"), height=350, width = 850)
p4
dev.off()
## quartz_off_screen
## 2
Make a nice UMAP plot
umap.df.int <- umap.df.int %>%
dplyr::mutate(Estage = gsub("_Endo", "", condition),
Estage = gsub("_HE.*", "", Estage),
Estage = gsub("_EMP.*", "", Estage),
Estage = gsub("_LMP.*", "", Estage),
Estage = gsub("YS_", "", Estage),
Estage_fac = case_when(Estage == "E9.0" ~ "g1",
Estage == "E9.5" ~ "g2",
Estage == "E10" ~ "g3",
Estage == "E10.5" ~ "g3"),
Cond_fac = gsub(".*_Endo", "Endo", condition),
Cond_fac = gsub(".*Kneg", "Kneg", Cond_fac),
Cond_fac = gsub(".*_HE.*", "HE", Cond_fac),
Cond_fac = gsub(".*_EMP", "EMP", Cond_fac),
Cond_fac = gsub(".*_LMP", "LMP", Cond_fac),
Cond_fac = factor(Cond_fac, levels = c("Endo", "Kneg", "HE", "EMP", "LMP"))
)
pal.YS <- c(
rep("#4E79A7", 3), # Endo
#rep("#F28E2B", 3), # Kit Neg
rep("#D82526", 6), # Runx1 & GFi1
#rep("#69B764", 3), # Gfi1
rep("#7a0177", 2), # EMP
rep("#f768a1", 2) # LMP
)
p1 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
geom_point(size=1) +
gghighlight(use_direct_label = FALSE) +
theme_custom +
facet_wrap( Cond_fac ~ Estage_fac, ncol=3) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
scale_colour_manual(values=pal.YS) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the FACS")
p1
pdf(file.path(outDir_inHouse6_scater_outlier, "facet_eStage_pop.pdf"), height=10, width = 14)
p1
dev.off()
## quartz_off_screen
## 2
ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "facet_eStage_pop2.pdf"), device = "pdf", width = 8, height = 12, dpi = 300)
outDir_noLMP <- file.path(outDir_inHouse6_scater, "no_Platelet")
dir.create(outDir_noLMP)
## Warning in dir.create(outDir_noLMP): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS/Scater_analysis_YS_only/no_Platelet' already exists
We noticed an the LMP population split into two. Based on expression of key genes this is the Platelet popualtion, in other words contamination.
geneDir <- file.path(outDir_noLMP, "gene_exp")
dir.create(geneDir)
## Warning in dir.create(geneDir): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS/Scater_analysis_YS_only/no_Platelet/gene_exp' already
## exists
gg <- "Gp9"
expInt <- assay(se, "logcounts")[gg,]
#exp2 <- sacle(expInt)
#exp2 <- as.vector(exp2)
df_plot <- umap.df.int %>%
mutate(gene = expInt)
ggplot(df_plot, aes(x=UMAP_1, y=UMAP_2, colour=gene)) +
geom_point() +
theme_custom +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "none") +
scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
NULL
ggsave(filename = file.path(geneDir, paste0(gg, "_UMAP.png")), device = "png", width = 6, height = 4, dpi = 300)
Lets remove the contaminating cluster. We can remove it by the in silico cluster or we use the INDEX FACS data. Lets use the INDEX data and merge it
cD <- as.data.frame(colData(se))
# Join with index FACS info
dd <- cD %>%
mutate(UniqID = paste(SC_project, SC_plate, FACS_well, sep="_")) %>%
left_join(df_inx_WN_S10)
## Joining, by = "UniqID"
umap.df.int <- umap.df.int %>%
dplyr::mutate(Barcode = se$Barcode)
dd_sub <- dd %>%
dplyr::filter(condition %in% c("YS_E9.5_EMP", "YS_E10.5_EMP",
"YS_E9.5_LMP", "YS_E10.5_LMP")) %>%
#dplyr::filter(Allele == "RG") %>%
mutate(lCD127_eF450 = log10(CD127_eF450 + 200) ,
lCD127_PE = log10(CD127_PE + 200),
lCD41_PE = log10(CD41_PE+ 200) ,
lCD41_PECy7 = log10(CD41_PECy7 + 200)) %>%
mutate(lCD41_PECy7_cutOff = if_else(lCD41_PECy7 >= 4.5, "high", "low")) %>%
dplyr::left_join(dplyr::select(umap.df.int, UMAP_1, UMAP_2, Barcode))
## Joining, by = "Barcode"
Plot CD41 in UMAP
ggplot(dd_sub, aes(x=UMAP_1, y=UMAP_2, colour=lCD127_PE)) +
geom_point() +
theme_custom +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right") +
scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
NULL
ggplot(dd_sub, aes(x=UMAP_1, y=UMAP_2, colour=lCD41_PECy7)) +
geom_point() +
theme_custom +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right") +
scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
NULL
ggplot(dd_sub, aes(x=UMAP_1, y=UMAP_2, colour=lCD41_PECy7_cutOff)) +
geom_point() +
theme_custom +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right") +
#scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
NULL
Plot FACS
ggplot(dd_sub, aes(x=lCD41_PECy7, y=lCD127_PE, colour=condition)) +
geom_point() +
theme_custom
## Warning: Removed 64 rows containing missing values (geom_point).
ggplot(dd_sub, aes(x=lCD41_PECy7, y=lCD127_PE, colour=lCD41_PECy7_cutOff)) +
geom_point() +
theme_custom
## Warning: Removed 64 rows containing missing values (geom_point).
Now lets remove cells which have high CD41 based on INDEX sort
toRemove_nonLMP <- dd_sub %>%
dplyr::filter(lCD41_PECy7_cutOff == "high") %>%
pull(Barcode)
#se2 <- se
se <- se[,!se$Barcode %in% toRemove_nonLMP]
set.seed(123)
dec.se <- modelGeneVar(se)
top.se <- getTopHVGs(dec.se, n=3500)
chosen <- top.se
se <- runPCA(se, subset_row=chosen)
set.seed(123)
se <- runUMAP(se, dimred="PCA", subset_row=chosen,
pca=30,
n_neighbors=35,
min_dist=0.25,
#n_neighbors=30,
#min_dist=0.3,
ncomponents = 2)
plotUMAP(se, colour_by="condition")
Try plot 3D
umap.df <- as.data.frame(reducedDim(se, "UMAP")) %>%
cbind(colData(se)) %>%
as.data.frame()
# 3D plot
x1 <- umap.df$V1
x2 <- umap.df$V2
x3 <- umap.df$V3
df_pal <- df.col_FACS %>%
dplyr::filter(FACS %in% umap.df$condition)
pal <- df_pal$Pal
kk <- umap.df$condition
kk.1 <- unique(umap.df$condition)
kk <- factor(as.character(kk),
levels=c(
"AGM_E10.5_Endo", "AGM_E10.5_HE_Runx1", "AGM_E10.5_HE_Gfi1", "AGM_E10.5_EHT", "AGM_E10.5_IAHC",
"YS_E9.0_Endo", "YS_E9.5_Endo", "YS_E10.5_Endo",
"YS_E9.0_HE_Gfi1_Kneg", "YS_E9.5_HE_Gfi1_Kneg", "YS_E10_HE_Gfi1_Kneg",
"YS_E9.0_HE_Runx1", "YS_E9.5_HE_Runx1", "YS_E10.5_HE_Runx1",
"YS_E9.0_HE_Gfi1", "YS_E9.5_HE_Gfi1", "YS_E10.5_HE_Gfi1",
"YS_E9.5_EMP","YS_E10.5_EMP",
"YS_E9.5_LMP", "YS_E10.5_LMP"))
kk[is.na(pal[kk])] %>% unique()
pal <- pal[kk]
plot3d(x1, x2, x3, col = pal)
# Or use plotly
plot_ly(umap.df, x= ~V1, y= ~V2, z= ~V3, color = ~condition)
umap.df <- reducedDim(se, "UMAP") %>%
cbind(colData(se)) %>%
as.data.frame() %>%
dplyr::rename(UMAP_1 = V1,
UMAP_2 = V2,
UMAP_3 = V3) %>%
# Add a column to simplify the annotation
mutate(
MainPop = gsub("AGM_", "", condition),
MainPop = gsub("YS_", "", condition),
MainPop = gsub("HE_Gfi1", "HE", MainPop),
MainPop = gsub("HE_Runx1", "HE", MainPop)
)
x1 <- umap.df$UMAP_1
x2 <- umap.df$UMAP_2
x3 <- umap.df$UMAP_3
library(rgl)
pal <- df.col_FACS %>%
dplyr::filter(FACS %in% umap.df$condition) %>%
pull(Pal)
kk <- umap.df.int$condition
kk <- as.factor(as.character(kk))
pal <- pal[kk]
plot3d(x1, x2, x3, col = pal)
Cluster with h clustering
set.seed(123)
dec.se <- modelGeneVar(se)
top.se <- getTopHVGs(dec.se, n=3000)
chosen <- top.se
# Hierarch by gene exp
chosen.exprs <- assay(se, "logcounts")[chosen,]
my.dist = dist(as.matrix(t(chosen.exprs)), method = "euclidean")
my.tree <- hclust(my.dist, method="ward.D2")
my.dend <- as.dendrogram(my.tree, hang=0.1)
my.clusters <- unname(cutreeDynamic(my.tree,
distM=as.matrix(my.dist),
verbose=0, minClusterSize=10,
deepSplit=2))
#my.clusters <- paste0("k_", my.clusters)
length(unique(my.clusters))
## [1] 10
# Making a prettier dendrogram.
my.tree$labels <- seq_along(my.tree$labels)
#dend <- as.dendrogram(my.tree, hang=0.1)
numClus <- length(unique(my.clusters))
coltree <- gg_color_hue(n=numClus)
my.tree_or <- my.clusters[order.dendrogram(my.dend)]
# Setting up colours for the FACS population
cFACS <- coltree[as.factor(se$TreeCut)]
cFACS <- coltree[as.factor(se$TreeCut)[order.dendrogram(my.dend)]]
no0_unique <-
function(x) {
u_x <- unique(x)
u_x[u_x != 0]
}
clusters_numbers <- no0_unique(my.tree_or)
n_clusters <- length(clusters_numbers)
cols <- gg_color_hue(n=n_clusters)
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or, values = cols) %>%
set("labels_col", "black")
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)
UniqClus <- unique(my.tree_or)
pdf(file.path(outDir_noLMP, "Dend.pdf"),height = 4, width = 12)
plot(dend2)
colored_bars(cFACS, dend2, sort_by_labels_order = FALSE, y_shift=10)
plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)
dev.off()
## quartz_off_screen
## 2
se$TreeCut <- paste0("k_", sprintf("%02d",my.clusters))
Graph based
# Re-run clusters
g <- buildSNNGraph(se, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
se$GraphBased <- paste0("g_", sprintf("%02d",clust))
Visualise UMAP better
umap.df.int <- reducedDim(se, "UMAP") %>%
cbind(colData(se)) %>%
as.data.frame() %>%
dplyr::rename(UMAP_1 = V1,
UMAP_2 = V2) %>%
dplyr::mutate(UMAP_1 = -1 * UMAP_1)
Plot
p1 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=SeqRun)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~SeqRun) +
theme_custom +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the Sequencing batch")
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~condition) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the FACS population")
p3 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~TreeCut) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
p3.5 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~GraphBased) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
p4 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=Phase)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~Phase) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cell cycle")
png(file.path(outDir_noLMP, "facet_SeqBatch.png"), height=350, width = 850)
p1
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_noLMP, "facet_FACS.png"), height=950, width = 1200)
p2
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_noLMP, "facet_cluster_tree.png"), height=950, width = 1200)
p3
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_noLMP, "facet_cluster_graph_default.png"), height=950, width = 1200)
p3.5
dev.off()
## quartz_off_screen
## 2
png(file.path(outDir_noLMP, "facet_CC.png"), height=350, width = 850)
p4
dev.off()
## quartz_off_screen
## 2
Lets simplify the hierachical clustering
#my.clusters <- cutree(my.tree, k=3)
#my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0, minClusterSize=10, deepSplit=2))
umap.df.int <- umap.df.int %>%
dplyr::mutate(TreeCut_re = TreeCut,
TreeCut_re =
case_when(
TreeCut_re == "k_01" ~ 'k_2',
TreeCut_re == "k_02" ~ 'k_3',
TreeCut_re == "k_03" ~ 'k_2',
TreeCut_re == "k_04" ~ 'k_4',
TreeCut_re == "k_05" ~ 'k_5',
TreeCut_re == "k_06" ~ 'k_2',
TreeCut_re == "k_07" ~ 'k_1',
TreeCut_re == "k_08" ~ 'k_1',
TreeCut_re == "k_09" ~ 'k_2',
TreeCut_re == "k_10" ~ 'k_2'
))
se$TreeCut_re <- umap.df.int$TreeCut_re
Plot the UMAP
# Setting up colours for the clusters
pal.YS_clust_simple <-
c(
"#A0CBE8", # Blue
"#FFBE7D", # Orange-sh
#"#BD8A4C", # Brown / Green
"#D4A6C8", "#631879", "#A20056" # Pink shades
)
p3.5 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut_re)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~TreeCut_re) +
theme_custom +
scale_colour_manual(values=pal.YS_clust_simple) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the in-silico clusters")
png(file.path(outDir_noLMP, "facet_cluster_tree_refactor.png"), height=950, width = 1200)
p3.5
dev.off()
## quartz_off_screen
## 2
Re-draw the dendogram. Re draw as it is
clust_num <- as.integer(gsub("k_", "", umap.df.int$TreeCut_re))
clusters_numbers <- no0_unique(clust_num)
n_clusters <- length(clusters_numbers)
my.tree_or2 <-
case_when(
my.tree_or == "1" ~ '2',
my.tree_or == "2" ~ '3',
my.tree_or == "3" ~ '2',
my.tree_or == "4" ~ '4',
my.tree_or == "5" ~ '5',
my.tree_or == "6" ~ '2',
my.tree_or == "7" ~ '1',
my.tree_or == "8" ~ '1',
my.tree_or == "9" ~ '2',
my.tree_or == "10" ~ '2') %>%
as.integer()
df_order <- data.frame(
TreeLab = my.dend %>% labels(),
Clust = my.tree_or2) %>%
dplyr::arrange(Clust)
cols <- pal.YS_clust_simple[c(5, 4, 3, 1, 2)]
dend2 <- branches_attr_by_clusters(my.dend, my.tree_or2, values = cols) %>%
set("labels_col", "black") %>%
rotate(as.character(df_order$TreeLab))
plot(dend2)
df_order2 <- data.frame(
TreeLab = dend2 %>% labels()
) %>%
left_join(df_order)
## Joining, by = "TreeLab"
UniqClus <- unique(df_order2$Clust)
pdf(file.path(outDir_noLMP, "Dend_reOrder.pdf"),height = 4, width = 12)
plot(dend2)
plot(UniqClus, ylim = c(-1, length(UniqClus)))
text(x =1:length(UniqClus), y=-1, UniqClus)
dev.off()
## quartz_off_screen
## 2
We make a nicer UMAP for publication
umap.df.int <- umap.df.int %>%
dplyr::mutate(Estage = gsub("_Endo", "", condition),
Estage = gsub("_HE.*", "", Estage),
Estage = gsub("_EMP.*", "", Estage),
Estage = gsub("_LMP.*", "", Estage),
Estage = gsub("YS_", "", Estage),
Estage_fac = case_when(Estage == "E9.0" ~ "g1",
Estage == "E9.5" ~ "g2",
Estage == "E10" ~ "g3",
Estage == "E10.5" ~ "g3"),
Cond_fac = gsub(".*_Endo", "Endo", condition),
Cond_fac = gsub(".*Kneg", "Kneg", Cond_fac),
Cond_fac = gsub(".*_HE.*", "HE", Cond_fac),
Cond_fac = gsub(".*_EMP", "EMP", Cond_fac),
Cond_fac = gsub(".*_LMP", "LMP", Cond_fac),
Cond_fac = factor(Cond_fac, levels = c("Endo", "Kneg", "HE", "EMP", "LMP"))
)
pal.YS <- c(
rep("#4E79A7", 3), # Endo
#rep("#F28E2B", 3), # Kit Neg
rep("#D82526", 6), # Runx1 & GFi1
#rep("#69B764", 3), # Gfi1
rep("#7a0177", 2), # EMP
rep("#f768a1", 2) # LMP
)
p1 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=condition)) +
geom_point(size=1) +
gghighlight(use_direct_label = FALSE) +
theme_custom +
facet_wrap( Cond_fac ~ Estage_fac, ncol=3) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
scale_colour_manual(values=pal.YS) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the FACS")
p1
pdf(file.path(outDir_noLMP, "facet_eStage_pop.pdf"), height=10, width = 14)
p1
dev.off()
## quartz_off_screen
## 2
ggsave(filename = file.path(outDir_noLMP, "facet_eStage_pop2.pdf"), device = "pdf", width = 8, height = 12, dpi = 300)
Show and group clusters
umap.df.int <- umap.df.int %>%
dplyr::mutate(ClusGroup =
case_when(TreeCut == "k_02" ~ "g1",
TreeCut == "k_04" ~ "g1",
TreeCut == "k_05" ~ "g1",
TreeCut == "k_08" ~ "g2",
TreeCut == "k_03" ~ "g2",
TreeCut == "k_06" ~ "g2",
TreeCut == "k_07" ~ "g2",
TreeCut == "k_09" ~ "g3",
TreeCut == "k_01" ~ "g3",
))
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~ ClusGroup, ncol=3) +
theme_custom +
#scale_colour_manual(values=g_hue) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cluster")
pdf(file.path(outDir_noLMP, "facet_TreeCluster.pdf"), height=3, width = 6)
p2
dev.off()
Show the Graph based clustering
pal.YS_clus <- c(
"#225ea8", "#A0CBE8", # Blue
"#F28E2B", "#FFBE7D", "#F1CE63", # Orange-sh
"#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
"#D4A6C8", "#631879", "#A20056" # Pink
)
pal.YS_clus <- c(
"#225ea8", "#A0CBE8", # Blue
"#F28E2B", "#FFBE7D", # Orange-sh
"#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
"#D4A6C8", "#631879", "#A20056" # Pink
)
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=GraphBased)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
#facet_wrap(~ ClusGroup, ncol=3) +
theme_custom +
scale_colour_manual(values=pal.YS_clus) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cluster")
# Add text to the plot
clust_data <- umap.df.int %>%
group_by(GraphBased) %>%
dplyr::summarise(UMAP_1 = mean(UMAP_1),
UMAP_2 = mean(UMAP_2))
p2.1 <- p2 +
geom_point(data = clust_data, aes(fill = GraphBased),
size = 10, shape = 21, colour = "white") +
scale_fill_manual(values = pal.YS_clus) +
geom_text(data = clust_data, aes(label = GraphBased),
colour = "black") +
theme(legend.position = "none")
p2.1
ggsave(filename = file.path(outDir_noLMP, "UMAP_GraphCluster.pdf"), device = "pdf", width = 7, height = 4.5, dpi = 300)
Show the Tree cut
pal.YS_clus <- c(
"#225ea8", "#A0CBE8", # Blue
"#F28E2B", "#FFBE7D", "#F1CE63", # Orange-sh
"#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
"#D4A6C8", "#631879", "#A20056" # Pink
)
pal.YS_clus <- c(
"#225ea8", "#A0CBE8", # Blue
"#F28E2B", "#FFBE7D", # Orange-sh
"#919C4C", "#C3C377","#BD8A4C", # 6-8 (Green)
"#D4A6C8", "#631879", "#A20056" # Pink
)
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=TreeCut_re)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
#facet_wrap(~ ClusGroup, ncol=3) +
theme_custom +
scale_colour_manual(values=pal.YS_clust_simple) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cluster")
# Add text to the plot
clust_data <- umap.df.int %>%
group_by(TreeCut_re) %>%
dplyr::summarise(UMAP_1 = mean(UMAP_1),
UMAP_2 = mean(UMAP_2))
p2.1 <- p2 +
geom_point(data = clust_data, aes(fill = TreeCut_re),
size = 10, shape = 21, colour = "white") +
scale_fill_manual(values = pal.YS_clust_simple) +
geom_text(data = clust_data, aes(label = TreeCut_re),
colour = "black") +
theme(legend.position = "none")
p2.1
# 35w 25h
ggsave(filename = file.path(outDir_noLMP, "UMAP_TreeCut.pdf"), device = "pdf", width = 6, height = 4, dpi = 300)
We can show in a hull plot
p_hull_1 <-
ggplot(umap.df.int, aes(x = UMAP_1, y = UMAP_2)) +
geom_mark_hull(
aes(fill=TreeCut_re, label=TreeCut_re),
alpha=0.5,
concavity=10, expand = unit(2, "mm"), radius = unit(1, "mm")) +
geom_point(size = 0.5, colour = "grey50") +
scale_fill_manual(values = pal.YS_clust_simple) +
theme_void() +
theme(legend.position = "none") +
NULL
p_hull_1
ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "GraphCluster_hull1.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)
p_hull_2 <-
ggplot(umap.df.int, aes(x = UMAP_1, y = UMAP_2)) +
geom_mark_hull(
aes(fill=GraphBased, label=GraphBased),
alpha=0.0,
concavity=20, expand = unit(1, "mm"), radius = unit(1, "mm")) +
geom_point(size = 2, aes(colour = GraphBased)) +
scale_colour_manual(values = pal.YS_clus) +
theme_void() +
theme(legend.position = "none") +
NULL
p_hull_2
ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "GraphCluster_hull2.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)
Show proportion as density plot (FACS like)
p1 <-
ggplot(
umap.df.int,
#dplyr::filter(umap.df.int, TreeCut_re == "k_2"),
aes(UMAP_1, UMAP_2)) +
#geom_density_2d(aes(colour = Estage)) +
geom_density_2d_filled(contour_var = "ndensity") +
facet_wrap(TreeCut_re ~ Estage, ncol=3) +
theme_custom +
NULL
umap.df.int_tmp <- umap.df.int %>%
dplyr::mutate(TreeCut_re = as.character(TreeCut_re),
Estage = as.character(Estage)) %>%
dplyr::mutate(Estage =
case_when(
TreeCut_re %in% c("k_1", "k_2","k_3", "k_4", "k_5") ~ "E1",
TRUE ~ Estage))
p2 <-
ggplot(
umap.df.int_tmp,
#dplyr::filter(umap.df.int, TreeCut_re == "k_2"),
aes(UMAP_1, UMAP_2)) +
#geom_density_2d(aes(colour = Estage)) +
geom_density_2d_filled(contour_var = "ndensity") +
facet_wrap(TreeCut_re ~ Estage, ncol=3) +
theme_custom +
geom_mark_hull(
aes(fill=TreeCut_re, label=TreeCut_re),
alpha=0.5,
concavity=10, expand = unit(2, "mm"), radius = unit(1, "mm")) +
#scale_colour_manual(values = pal.YS_clus) +
NULL
pdf(file.path(outDir_noLMP, "Density_Estage.pdf"), width = 9, height = 7)
p1
p2
dev.off()
## quartz_off_screen
## 2
Show proportion of cells in each cluster
meta = umap.df.int %>%
dplyr::mutate(
Cond_simple = gsub("HE_Gfi1", "HE", condition),
Cond_simple = gsub("HE_Runx1", "HE", Cond_simple),
Cond_simple = gsub("HE_Kneg", "HE_Gfi1_Kneg", Cond_simple))
tmp.pal <- df.col_FACS_simp %>%
dplyr::filter(FACS %in% meta$Cond_simple) #%>%
#pull(Pal)
meta <- meta %>%
dplyr::mutate(Cond_simple = factor(Cond_simple, levels=tmp.pal$FACS))
pal.YS <- c(
rep("#4E79A7", 3), # Endo
#rep("#F28E2B", 3), # Kit Neg
rep("#D82526", 3), # Runx1 & GFi1
#rep("#69B764", 3), # Gfi1
rep("#7a0177", 2), # EMP
rep("#f768a1", 2) # LMP
)
# Proportion by FACS
stat1 <-
meta %>%
dplyr::group_by(TreeCut_re, Cond_simple) %>%
dplyr::summarise(count=n()) %>%
dplyr::mutate(perc=(count/sum(count)*100)) %>%
dplyr::mutate(eDay = gsub("YS_", "", Cond_simple),
eDay = gsub("_.*", "", eDay),
eDay = factor(eDay, levels = c("E9.0", "E9.5", "E10", "E10.5")))
## `summarise()` has grouped output by 'TreeCut_re'. You can override using the `.groups` argument.
ggplot(stat1, aes(x = TreeCut_re, y = perc, fill = Cond_simple,
colour=eDay, linetype=eDay,
label = round(perc, 0))) +
geom_bar(stat="identity", width = 0.5) +
geom_text(size = 4, position = position_stack(vjust = 0.5)) +
#scale_fill_manual(values = tmp.pal$Pal) +
scale_fill_manual(values = pal.YS) +
scale_colour_manual(values=c("black", "grey20", "grey50", "black")) +
scale_linetype_manual(values=c("blank", "solid", "solid", "longdash")) +
labs(x="Cluster", y="Percentage", fill="FACS") +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
strip.background = element_rect(fill="white", colour = "white"),
legend.position = "right") +
#coord_flip() +
NULL
# 75w 20h if flip, opposite if not
#ggsave(filename = file.path(outDir_inHouse6_scater_outlier, "BarChart_cluster.pdf"), device = "pdf", width = 12, height = 5, dpi = 300)
ggsave(filename = file.path(outDir_noLMP, "BarChart_cluster.pdf"), device = "pdf", width = 6, height = 9, dpi = 300)
Show proportion of cells in each FACS
#meta = umap.df.int
tmp.pal <- df.col_FACS_simp %>%
dplyr::filter(FACS %in% meta$Cond_simple) %>%
pull(Pal)
# Proportion by FACS
stat1 <-
meta %>%
#dplyr::filter(TreeCut_re == "k_2") %>%
dplyr::group_by(Cond_simple, TreeCut_re) %>%
dplyr::summarise(count=n()) %>%
dplyr::mutate(perc=(count/sum(count)*100)) %>%
dplyr::mutate(eDay = gsub("YS_", "", Cond_simple),
eDay = gsub("_.*", "", eDay),
eDay = factor(eDay, levels = c("E9.0", "E9.5", "E10", "E10.5")))
## `summarise()` has grouped output by 'Cond_simple'. You can override using the `.groups` argument.
ggplot(stat1, aes(x = Cond_simple, y = perc, fill = TreeCut_re,
#colour=condition,
label = round(perc, 0))) +
geom_bar(stat="identity", width = 0.5) +
geom_text(size = 4, position = position_stack(vjust = 0.5)) +
#scale_fill_manual(values = tmp.pal) +
scale_fill_manual(values = pal.YS_clust_simple) +
scale_colour_manual(values=c(pal.YS)) +
#scale_linetype_manual(values=c("blank", "solid", "solid", "longdash")) +
labs(x="Cluster", y="Percentage", fill="FACS") +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
strip.background = element_rect(fill="white", colour = "white"),
axis.text.x= element_text(angle = 45),
legend.position = "right") +
#coord_flip() +
NULL
# 75w 20h if flip, opposite if not
ggsave(filename = file.path(outDir_noLMP, "BarChart_FACS.pdf"), device = "pdf", width = 12, height = 9, dpi = 300)
Proportion as bar chart, not stacked
ggplot(stat1, aes(x = TreeCut_re, y = perc, fill = TreeCut_re,
label = round(perc, 0))) +
geom_bar(stat="identity", width = 0.5) +
facet_wrap(~Cond_simple) +
#geom_text(size = 4, position = position_stack(vjust = 0.5)) +
geom_text(size = 4) +
scale_fill_manual(values = pal.YS_clust_simple) +
labs(x="Cluster", y="Percentage", fill="Cluster") +
theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
strip.background = element_rect(fill="white", colour = "white"),
legend.position = "right") +
#coord_flip() +
NULL
ggsave(filename = file.path(outDir_noLMP, "BarChart_facet_FACS.pdf"), device = "pdf", width = 12, height = 8, dpi = 300)
We stick with Tree based clusters
We find the marker gene in each clusters. Seurat is a more suitable approach to find genes associated with clusters
inhouse.combined <- merge(seuset_nova, y = seuset_next, add.cell.ids = c("nova", "next"), project = "inhouse")
Do normal processing
seuset <- inhouse.combined
seuset <- seuset[,seuset$Barcode %in% se$Barcode ]
# Match the order
dd <- seuset[[]]
cD <- as.data.frame(colData(se)) %>%
dplyr::select(Barcode,TreeCut_re, GraphBased )
dd <- dd %>%
dplyr::left_join(cD)
## Joining, by = "Barcode"
seuset$GraphBased <- factor(dd$GraphBased, levels=paste0("g_", sprintf("%02d",1:length(unique(dd$GraphBased)))))
seuset$TreeCut_re <- factor(dd$TreeCut_re, levels=paste0("k_", sprintf("%01d",1:length(unique(dd$TreeCut_re)))))
Do normal processing
seuset <- NormalizeData(seuset)
# Feature selections
seuset <-
FindVariableFeatures(object = seuset, selection.method = "vst", nfeatures = 2500)
# Scaling the data
all.genes <- rownames(x = seuset)
seuset <- CellCycleScoring(seuset, s.features = s.genes, g2m.features = g2m.genes, set.ident = FALSE)
## Warning: The following features are not present in the object: MCM5, PCNA,
## TYMS, FEN1, MCM7, MCM4, RRM1, UNG, GINS2, MCM6, CDCA7, DTL, PRIM1, UHRF1, CENPU,
## HELLS, RFC2, POLR1B, NASP, RAD51AP1, GMNN, WDR76, SLBP, CCNE2, UBR7, POLD3,
## MSH2, ATAD2, RAD51, RRM2, CDC45, CDC6, EXO1, TIPIN, DSCC1, BLM, CASP8AP2, USP1,
## CLSPN, POLA1, CHAF1B, MRPL36, E2F8, not searching for symbol synonyms
## Warning: The following features are not present in the object: HMGB2, CDK1,
## NUSAP1, UBE2C, BIRC5, TPX2, TOP2A, NDC80, CKS2, NUF2, CKS1B, MKI67, TMPO, CENPF,
## TACC3, PIMREG, SMC4, CCNB2, CKAP2L, CKAP2, AURKB, BUB1, KIF11, ANP32E, TUBB4B,
## GTSE1, KIF20B, HJURP, CDCA3, JPT1, CDC20, TTK, CDC25C, KIF2C, RANGAP1, NCAPD2,
## DLGAP5, CDCA2, CDCA8, ECT2, KIF23, HMMR, AURKA, PSRC1, ANLN, LBR, CKAP5, CENPE,
## CTCF, NEK2, G2E3, GAS2L3, CBX5, CENPA, not searching for symbol synonyms
## Warning in AddModuleScore(object = object, features = features, name = name, :
## Could not find enough features in the object from the following feature lists:
## S.Score Attempting to match case...Could not find enough features in the object
## from the following feature lists: G2M.Score Attempting to match case...
# Add either originated from nova or next
Seq_name <- colnames(seuset)
Seq_name <- gsub("_.*", "", Seq_name)
seuset$Sequencer <- Seq_name
# Dont need to regress anything at this stage as we only want to remove the tech rep and outlier
seuset <- ScaleData(seuset,
#vars.to.regress = c("S.Score", "G2M.Score", "Sequencer"),
features = rownames(seuset))
## Centering and scaling data matrix
seuset <- RunPCA(seuset, verbose = FALSE)
seuset <- RunUMAP(seuset, dims = 1:30, verbose = FALSE, min.dist = 0.4)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
seuset <- FindNeighbors(seuset, dims = 1:30, verbose = FALSE)
seuset <- FindClusters(seuset, verbose = FALSE)
Show the UMAP plot
DimPlot(seuset)
ggsave(filename = file.path(outDir_noLMP, "Seurat_UMAP_clusterSeu.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)
Another one
Idents(seuset) <- seuset$TreeCut_re
DimPlot(seuset)
ggsave(filename = file.path(outDir_noLMP, "Seurat_UMAP_clusterTreeCut.pdf"), device = "pdf", width = 7, height = 5, dpi = 300)
dd2 <- dd %>%
mutate(seurat_clusters = seuset$seurat_clusters)
row.names(dd2) <- dd2$Barcode
dd2 <- dd2[se$Barcode,]
se$seurat_clusters <- dd2$seurat_clusters
Plot
umap.df.int$seurat_clusters <- dd2$seurat_clusters
p2 <-
ggplot(umap.df.int, aes(x=UMAP_1, y=UMAP_2, colour=seurat_clusters)) +
geom_point() +
gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
#facet_wrap(~ ClusGroup, ncol=3) +
theme_custom +
scale_colour_manual(values=pal.YS_clus) +
#scale_colour_manual(values=tableau20) +
labs(title="Highlighting the cluster")
# Add text to the plot
clust_data <- umap.df.int %>%
group_by(seurat_clusters) %>%
dplyr::summarise(UMAP_1 = mean(UMAP_1),
UMAP_2 = mean(UMAP_2))
p2.1 <- p2 +
geom_point(data = clust_data, aes(fill = seurat_clusters),
size = 10, shape = 21, colour = "white") +
scale_fill_manual(values = pal.YS_clus) +
geom_text(data = clust_data, aes(label = seurat_clusters),
colour = "black") +
theme(legend.position = "none")
p2.1
ggsave(filename = file.path(outDir_noLMP, "UMAP_SeuratCluster.pdf"), device = "pdf", width = 7, height = 6, dpi = 300)
Find marker genes
su.markers <-
FindAllMarkers(seuset, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster k_1
## Calculating cluster k_2
## Calculating cluster k_3
## Calculating cluster k_4
## Calculating cluster k_5
su.markers <- su.markers %>%
dplyr::mutate(isCS = ifelse(gene %in% df_cs$gene_name, "CS", "notCS"))
write.csv(su.markers, file.path(outDir_noLMP, "SeuratMarkers_TreeCut.csv"))
Plot a dot plot of the top genes
top10 <- su.markers %>% dplyr::group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
pdf(file.path(outDir_noLMP, "DotPlot.pdf"), height = 12, width = 9)
DotPlot(seuset,features=unique(top10$gene)) + coord_flip() +
theme(axis.text.y = element_text(size = 9)) +
scale_colour_gradient2(low = "#FF00FF", mid = "#000000", high = "#FFFF00")
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
dev.off()
## quartz_off_screen
## 2
pdf(file.path(outDir_noLMP, "Heatmap.pdf"), height = 12, width = 15)
DoHeatmap(seuset,features=unique(top10$gene), raster = FALSE)
#theme(axis.text.y = element_text(size = 9)) +
#scale_colour_gradient2(low = "#FF00FF", mid = "#000000", high = "#FFFF00")
dev.off()
## quartz_off_screen
## 2
Specific DE
su.markers2 <-
FindMarkers(seuset, only.pos = FALSE,
ident.1 = c("k_4"), # EMP
ident.2 = c("k_5"), # LMP
min.pct = 0.25, logfc.threshold = 0.25) %>%
tibble::rownames_to_column(var = "gene") %>%
dplyr::mutate(isCS = ifelse(gene %in% df_cs$gene_name, "CS", "notCS"))
write.csv(su.markers2, file.path(outDir_noLMP, "SeuratMarkers_k4_EMP_vs_k6_LMP.csv"))
Lets view the expression of selected genes
# Plot kit expression
plotUMAP(se, colour_by="Kit") +
scale_fill_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 0.9)
se$Cond_fac <- umap.df.int$Cond_fac
expInt <- assay(se, "logcounts")["Kit",]
m <- as.data.frame(expInt)
df_plot <- umap.df.int %>%
mutate(Gene = m$expInt) %>%
mutate(xLab = paste(Estage_fac, Cond_fac, sep="_"),
xLab = factor(xLab, levels=
c("g1_Endo", "g2_Endo", "g3_Endo",
#"g1_Kneg", "g2_Kneg", "g3_Kneg",
"g1_HE", "g2_HE", "g3_HE",
"g2_EMP", "g3_EMP",
"g2_LMP", "g3_LMP"
)))
pal.YS <- c(
rep("#4E79A7", 3), # Endo
#rep("#F28E2B", 3), # Kit Neg
rep("#D82526", 3), # Runx1 & GFi1
#rep("#69B764", 3), # Gfi1
rep("#7a0177", 2), # EMP
rep("#f768a1", 2) # LMP
)
p_viol_gene <-
df_plot %>%
ggplot(aes(x = xLab, y = Gene, fill = Cond_fac)) +
geom_violin(alpha = 0.5, scale = "width", position = position_dodge(width = 0.9)) +
geom_boxplot(alpha = 0.5, width = 0.2, position = position_dodge(width = 0.9)) +
scale_fill_manual(values=unique(pal.YS)) +
theme_bw() +
theme(legend.position = "none",
panel.grid.major = element_line(linetype = "blank"),
panel.grid.minor = element_line(linetype = "blank"),
axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = c(0,12),
breaks = c(0,5,10)) +
xlab("") +
NULL
pdf(file.path(outDir_noLMP, "Kit_exp.pdf"),height = 3, width = 6)
p_viol_gene
dev.off()
## quartz_off_screen
## 2
Nicer UMAP plot
geneDir <- file.path(outDir_noLMP, "gene_exp")
dir.create(geneDir)
## Warning in dir.create(geneDir): '/Users/zfadlullah/Dropbox (The University
## of Manchester)/zaki/PhD/sequencing_runs/2021/2021_06_01_MZ_016/output/
## Chapter_03_InHouse_YS/Scater_analysis_YS_only/no_Platelet/gene_exp' already
## exists
gg <- "Kit"
expInt <- assay(se, "logcounts")[gg,]
#exp2 <- sacle(expInt)
#exp2 <- as.vector(exp2)
df_plot <- umap.df.int %>%
mutate(gene = expInt)
ggplot(df_plot, aes(x=UMAP_1, y=UMAP_2, colour=gene)) +
geom_point() +
theme_custom +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "none") +
scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
NULL
ggsave(filename = file.path(geneDir, paste0(gg, "_UMAP.png")), device = "png", width = 6, height = 4, dpi = 300)
We show violin plot of mulitple genes
e make a violin plot of genes expressed in each cluster
geneToPlot <- c(
"Runx1",
"Gfi1",
"Gfi1b",
"Itga2b",
"Itgb3",
"Ptprc",
"Epor",
"Mpl",
"Rgs18",
"Pf4",
"Mpo",
"Il7r")
geneToPlot <- c(
"Lyve1",
"Runx1",
"Gfi1",
"Gfi1b",
#"Rgs18",
"Itga2b",
"Itgb3",
"Ptprc",
"Il7r",
"Ighm",
"Epor",
"Mpl",
"Pf4")
Lets try to make a nice violin plot of these genes
setmp <- se
setmp$finalCluster2 <- setmp$TreeCut_re
pal <- pal.YS_clust_simple
expInt <- assay(setmp, "logcounts")[geneToPlot,]
m <- as.data.frame(expInt)
colnames(m) <- setmp$Barcode
m$gene <- row.names(m)
m <- melt(m)
## Using gene as id variables
m$gene <- factor(m$gene, levels=geneToPlot)
dftmp <- as.data.frame(colData(setmp))
cD_sub <- dftmp %>%
dplyr::select(Barcode, finalCluster2) # %>%
#dplyr::filter(cell_cycle %in% "G1")
m <- m %>%
dplyr::left_join(cD_sub, by = c("variable" = "Barcode"))
# Plot
p_viol_gene <-
m %>%
ggplot(aes(x = finalCluster2, y = value, colour = finalCluster2)) +
geom_violin(fill=NA, scale="width") +
geom_quasirandom(groupOnX=TRUE, size = 0.8, alpha = 0.5, width = 0.4) +
stat_summary(fun.y = mean, fun.ymin = mean, fun.ymax = mean,
geom = "crossbar", width = 0.3, alpha = 0.8, colour="black") +
facet_grid(gene ~ .) +
scale_colour_manual(values=pal) +
theme_bw() +
theme(legend.position = "none",
panel.grid.major = element_line(linetype = "blank"),
panel.grid.minor = element_line(linetype = "blank"),
axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = c(0,15),
breaks = c(0,5,10, 15)) +
xlab("") +
NULL
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.
# 45w 77h
ggsave(filename = file.path(outDir_noLMP, "SelectedGenes.pdf"), device = "pdf", width = 6, height = 14, dpi = 300)
If we want to show numb of expressing cells
# Find number of cells expressing in each group and write it on the top of the plot
df_g <- m %>%
group_by(finalCluster2, gene) %>%
dplyr::summarise(Ncell = n(),
Nexp = sum( value >0 )) %>%
dplyr::mutate(PecExp = round(Nexp / Ncell * 100, 1),
#Text = paste0(Nexp, " (", PecExp, "%", ")"))
Text = paste0(" (", PecExp, "%", ")"))
#Text = paste0(Nexp, "/", Ncell, "=", PecExp, "%"))
p_viol_gene2 <-
m %>%
ggplot(aes(x = finalCluster2, y = value, colour = finalCluster2)) +
geom_violin(fill=NA, scale="width") +
geom_quasirandom(groupOnX=TRUE, size = 2.2, alpha = 0.7, width = 0.4) +
stat_summary(fun.y = mean, fun.ymin = mean, fun.ymax = mean,
geom = "crossbar", width = 0.3, alpha = 0.8, colour="black") +
facet_grid(gene ~ .) +
scale_colour_manual(values=pal) +
theme_bw() +
theme(legend.position = "none",
panel.grid.major = element_line(linetype = "blank"),
panel.grid.minor = element_line(linetype = "blank"),
axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(limits = c(0,15),
breaks = c(0,5,10, 15)) +
xlab("") +
xlab("") +
geom_text(data=df_g ,
aes(x = finalCluster2, y = max(m$value + 1), label=Text),
color="black", size = 4) +
NULL
pdf(file.path(outDir_DE, "MarkerGenes_perCluster_v4.pdf"), height = 21.6, width = 19.6)
p_viol_gene2
dev.off()
We want to see if we can say antyhing about the index sort values. Subset to only the interested YS populations
cD <- as.data.frame(colData(se))
# Join with index FACS info
dd <- cD %>%
mutate(UniqID = paste(SC_project, SC_plate, FACS_well, sep="_")) %>%
left_join(df_indx)
# Add a column to simplify the annotation
dd <- dd %>%
mutate(
MainPop = gsub("AGM_", "", condition),
MainPop = gsub("YS_", "", condition),
MainPop = gsub("HE_Gfi1", "HE", MainPop),
MainPop = gsub("HE_Runx1", "HE", MainPop)
)
umap.df.int <- umap.df.int %>%
dplyr::mutate(Barcode = se$Barcode)
dd_sub <- dd %>%
dplyr::filter(condition %in% c("YS_E9.0_HE_Runx1", "YS_E9.5_HE_Runx1", "YS_E10.5_HE_Runx1",
"YS_E9.0_HE_Gfi1", "YS_E9.5_HE_Gfi1", "YS_E10.5_HE_Gfi1")) %>%
#dplyr::filter(Allele == "RG") %>%
mutate(lRUNX1b = log10(Runx1b_RFP + 200) ,
lGfi1 = log10(Gfi1_GFP+ 200) ) %>%
mutate(MainPop = factor(MainPop, levels=c("E9.0_HE", "E9.5_HE", "E10.5_HE"))) %>%
dplyr::left_join(dplyr::select(umap.df.int, UMAP_1, UMAP_2, Barcode))
Plot RFP in UMAP
gg <- "RFP"
expInt <- data.frame(
gene = assay(se, "logcounts")[gg,],
Barcode = se$Barcode)
df_plot <- dd_sub %>%
left_join(expInt)
ggplot(df_plot, aes(x=UMAP_1, y=UMAP_2, colour=gene)) +
geom_point() +
theme_custom +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "none") +
scale_colour_viridis(option = "plasma", na.value = "grey80", begin = 0, end = 1) +
NULL
p2 <-
ggplot(dd_sub, aes(y=(lRUNX1b), x=(lGfi1), colour=Allele)) +
geom_point(alpha = 0.5, size=3) +
#gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~MainPop) +
theme_custom +
geom_vline(xintercept = 3.5, linetype = "dashed") +
geom_hline(yintercept = 3, linetype = "dashed") +
#scale_colour_manual(values=c("#00B050", "#FF006D")) +
scale_colour_manual(values=c("black", "firebrick")) +
NULL
pdf(file.path(outDir_inHouse6_scater_outlier, "INDEX_corelatiion.pdf"), height=4, width = 9)
p2
dev.off()
Extract the cells with really high Gfi1 or Runx1
dd_sub <- dd_sub %>%
dplyr::mutate(FACSgate = case_when(
lRUNX1b >= 3.5 & lGfi1 >= 4 & condition == "YS_E10.5_HE_Gfi1" ~ "High",
lRUNX1b <= 3.2 & lGfi1 <= 3.2 ~ "Low",
TRUE ~ "notAssigned"))
ggplot(dd_sub, aes(y=(lRUNX1b), x=(lGfi1), colour=FACSgate)) +
geom_point(alpha = 0.5, size=3) +
#gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~MainPop) +
theme_custom +
geom_vline(xintercept = 4, linetype = "dashed") +
geom_hline(yintercept = 3.5, linetype = "dashed") +
geom_vline(xintercept = 3.2, linetype = "dashed", colour="red") +
geom_hline(yintercept = 3.2, linetype = "dashed", colour="red") +
NULL
ggplot(dd_sub, aes(y=(lRUNX1b), x=(lGfi1), colour=condition)) +
geom_point(alpha = 0.5, size=3) +
#gghighlight(use_direct_label = FALSE) +
theme(legend.position = "right") +
facet_wrap(~MainPop) +
theme_custom +
geom_vline(xintercept = 4, linetype = "dashed") +
geom_hline(yintercept = 3.5, linetype = "dashed") +
geom_vline(xintercept = 3.2, linetype = "dashed", colour="red") +
geom_hline(yintercept = 3.2, linetype = "dashed", colour="red") +
NULL
Plot where they are in UMAP
dd_sub2 <- dd_sub %>%
dplyr::select(Barcode, FACSgate)
dd2 <- dd %>%
left_join(dd_sub2)
dd_dim <- reducedDim(se, "UMAP")
colnames(dd_dim) <- c("UMAP_1", "UMAP_2")
dd2 <- cbind(dd2, dd_dim)
ggplot(dd2, aes(x=UMAP_1, y=UMAP_2, colour=FACSgate)) +
geom_point(size=3) +
theme_custom +
NULL
Save the seObject with the cluster information
# Cells to exclude
toRemove <- c(toRemove, toRemove_nonLMP)
saveRDS(se, file.path(outData_dir, "se_YS_only_fullInfo.RDS"))
write.csv(umap.df.int, file.path(outDir_noLMP, "Metadata_YS_only.csv"))
saveRDS(toRemove, file.path(outData_dir, "Cells_to_exclude.csv"))
# Full image
save.image(file=file.path(outDir_noLMP, "Full_image.Rdata"))
#load(file=file.path(outDir_noLMP, "Full_image.Rdata"))
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dynamicTreeCut_1.63-1 dendextend_1.15.1
## [3] scran_1.20.1 paletteer_1.3.0
## [5] ggforce_0.3.3 viridis_0.6.1
## [7] viridisLite_0.4.0 ggrepel_0.9.1
## [9] gghighlight_0.3.2 ggbeeswarm_0.6.0
## [11] reshape2_1.4.4 dplyr_1.0.6
## [13] patchwork_1.1.1 SeuratObject_4.0.4
## [15] Seurat_4.0.6 scater_1.20.1
## [17] ggplot2_3.3.4 scuttle_1.2.0
## [19] SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
## [21] Biobase_2.52.0 GenomicRanges_1.44.0
## [23] GenomeInfoDb_1.28.0 IRanges_2.26.0
## [25] S4Vectors_0.30.0 BiocGenerics_0.38.0
## [27] MatrixGenerics_1.4.0 matrixStats_0.59.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 reticulate_1.20
## [3] tidyselect_1.1.1 htmlwidgets_1.5.3
## [5] grid_4.1.0 BiocParallel_1.26.0
## [7] Rtsne_0.15 munsell_0.5.0
## [9] ScaledMatrix_1.0.0 codetools_0.2-18
## [11] ica_1.0-2 statmod_1.4.36
## [13] future_1.21.0 miniUI_0.1.1.1
## [15] withr_2.4.2 colorspace_2.0-1
## [17] highr_0.9 knitr_1.33
## [19] ROCR_1.0-11 tensor_1.5
## [21] listenv_0.8.0 labeling_0.4.2
## [23] GenomeInfoDbData_1.2.6 polyclip_1.10-0
## [25] farver_2.1.0 parallelly_1.26.0
## [27] vctrs_0.3.8 generics_0.1.0
## [29] xfun_0.24 R6_2.5.1
## [31] rsvd_1.0.5 locfit_1.5-9.4
## [33] isoband_0.2.4 concaveman_1.1.0
## [35] bitops_1.0-7 spatstat.utils_2.2-0
## [37] DelayedArray_0.18.0 assertthat_0.2.1
## [39] promises_1.2.0.1 scales_1.1.1
## [41] beeswarm_0.4.0 gtable_0.3.0
## [43] beachmat_2.8.0 globals_0.14.0
## [45] goftest_1.2-2 rlang_0.4.11
## [47] splines_4.1.0 lazyeval_0.2.2
## [49] spatstat.geom_2.2-0 prismatic_1.0.0
## [51] yaml_2.2.1 abind_1.4-5
## [53] httpuv_1.6.1 tools_4.1.0
## [55] ellipsis_0.3.2 spatstat.core_2.2-0
## [57] jquerylib_0.1.4 RColorBrewer_1.1-2
## [59] ggridges_0.5.3 Rcpp_1.0.7
## [61] plyr_1.8.6 sparseMatrixStats_1.4.0
## [63] zlibbioc_1.38.0 purrr_0.3.4
## [65] RCurl_1.98-1.3 rpart_4.1-15
## [67] deldir_0.2-10 pbapply_1.4-3
## [69] cowplot_1.1.1 zoo_1.8-9
## [71] cluster_2.1.2 magrittr_2.0.1
## [73] data.table_1.14.0 RSpectra_0.16-0
## [75] scattermore_0.7 lmtest_0.9-38
## [77] RANN_2.6.1 fitdistrplus_1.1-5
## [79] mime_0.10 evaluate_0.14
## [81] xtable_1.8-4 gridExtra_2.3
## [83] compiler_4.1.0 tibble_3.1.2
## [85] KernSmooth_2.23-20 V8_3.4.2
## [87] crayon_1.4.1 htmltools_0.5.1.1
## [89] mgcv_1.8-35 later_1.2.0
## [91] tidyr_1.1.3 DBI_1.1.1
## [93] tweenr_1.0.2 MASS_7.3-54
## [95] Matrix_1.3-3 metapod_1.0.0
## [97] igraph_1.2.6 pkgconfig_2.0.3
## [99] plotly_4.9.4.1 spatstat.sparse_2.0-0
## [101] vipor_0.4.5 bslib_0.2.5.1
## [103] dqrng_0.3.0 XVector_0.32.0
## [105] stringr_1.4.0 digest_0.6.27
## [107] sctransform_0.3.2 RcppAnnoy_0.0.18
## [109] spatstat.data_2.1-0 rmarkdown_2.9
## [111] leiden_0.3.8 uwot_0.1.10
## [113] edgeR_3.34.0 DelayedMatrixStats_1.14.0
## [115] curl_4.3.1 shiny_1.6.0
## [117] lifecycle_1.0.0 nlme_3.1-152
## [119] jsonlite_1.7.2 BiocNeighbors_1.10.0
## [121] limma_3.48.0 fansi_0.5.0
## [123] pillar_1.6.1 lattice_0.20-44
## [125] fastmap_1.1.0 httr_1.4.2
## [127] survival_3.2-11 glue_1.4.2
## [129] FNN_1.1.3 png_0.1-7
## [131] bluster_1.2.1 stringi_1.6.2
## [133] sass_0.4.0 rematch2_2.1.2
## [135] BiocSingular_1.8.1 irlba_2.3.3
## [137] future.apply_1.7.0